# In order to run the code, uncomment this chunk and install required packages

# install.packages('readxl')
# install.packages('caret')
# install.packages('corrplot')
# install.packages('outliers')
# install.packages('e1071')
# install.packages("randomForest")
# install.packages("rpart.plot")
# install.packages("xgboost")
# install.packages("SHAPforxgboost")
# install.packages("factoextra")
# install.packages("NbClust")
# install.packages("plotly")
library("readxl")
library(corrplot)
library(outliers)
library(caret)
library(e1071)
library("randomForest")
library(rpart)
library(rpart.plot)
library(xgboost)
library("SHAPforxgboost")
library(factoextra)
library(NbClust)
library(clValid)
library(dplyr)
library(plotly)
# Loading data
data <- read_excel("ChurnData.xlsx")
New names:
• `` -> `...18`
head(data)
# Removing NA values: drop all-NA columns first, then any remaining
# incomplete rows.
data <- data[,colSums(is.na(data))<nrow(data)]
data <- na.omit(data)
# Removing unnecessary column Customer
data <- subset(data, select = -Customer)
# Removing spaces in column names
colnames(data) <- make.names(names(data))
# Replacing 0s in Account.Length with 1.
# Bug fix: the previous `data[data$Account.Length == 0, ] <- 1` assigned 1
# to EVERY column of the matching rows, destroying those observations.
# Only the Account.Length value should be replaced.
data$Account.Length[data$Account.Length == 0] <- 1
hist(data$Customer.Service.Calls,
    main = "Customer service calls distribution",
    xlab = "Customer service calls, calls",
    xlim = c(0, max(data$Customer.Service.Calls)),
    col = "dodgerblue3",
    breaks = 10)

axis(side=1, at=seq(0, 9, 1))

# Churners VS Non churners distribution
num_of_churners_non_churners <- c(nrow(data[data$Churn==1, ]),nrow(data[data$Churn==0, ]))
percentage <- round(num_of_churners_non_churners * 100 / sum(num_of_churners_non_churners), 1)
colors <- c("dodgerblue1","dodgerblue4")

pie(num_of_churners_non_churners, 
    main = "Churners VS Non churners distribution",
    labels = percentage,
    col = colors)

legend("topright", c("Churners","Non churners"), cex = 1, fill = colors)

par(mfrow=c(3,1))
# Income boxplot
boxplot(data$Income,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Income)),
        main = "Income distribution",
        xlab = "Annual Income, $",
        col="dodgerblue3")
# Account Length boxplot
boxplot(data$Account.Length,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Account.Length)),
        main = "Account Length distribution",
        xlab = "Account Length, months",
        col="dodgerblue3")
# Voice Mail Messages boxplot
boxplot(data$Voice.Mail.Messages,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Voice.Mail.Messages)),
        main = "Voice Mail Messages distribution",
        xlab = "Voice Mail Messages, messages",
        col="dodgerblue3")

par(mfrow=c(3,1))
# Day Minutes boxplot
boxplot(data$Day.Minutes,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Day.Minutes)),
        main = "Day Minutes distribution",
        xlab = "Day Minutes, min",
        col="dodgerblue3")
# Evening Minutes boxplot
boxplot(data$Evening.Minutes,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Evening.Minutes)),
        main = "Evening Minutes distribution",
        xlab = "Evening Minutes, min",
        col="dodgerblue3")
# Night Minutes boxplot
boxplot(data$Night.Minutes,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Night.Minutes)),
        main = "Night Minutes distribution",
        xlab = "Night Minutes, min",
        col="dodgerblue3")

# International Minutes boxplot
boxplot(data$International.Minutes,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$International.Minutes)),
        main = "International Minutes distribution",
        xlab = "International Minutes, min",
        col="dodgerblue3")

data_out <- data[, c("Income", "Account.Length", "Day.Minutes", "Day.Calls", "Evening.Minutes", "Evening.Calls", "Night.Minutes", "Night.Calls", "Customer.Service.Calls")]

# Filtering out outliers using the z-score method: a row is kept only if
# none of the selected numeric columns has |z| > 3.
z_scores <- abs(scale(data_out))

not_outliers <- which(rowSums(z_scores > 3) == 0)

# Index rows by position. The previous approach matched character rownames
# against integer indices (`rownames(data) %in% not_outliers`), which relies
# on implicit coercion and is fragile for tibbles; positional indexing is
# equivalent here because data_out has the same row order as data.
data <- data[not_outliers, ]
# Building the correlation plot to see if there is correlation between columns
corrplot(cor(data),
         method = "circle",
         type = "lower",
         tl.col = "black",
         tl.cex = 0.6,
         bg = "white",
         mar=c(0,0,2,0))

# Calculating monthly day, evening and night calls durations
data$dcall_avg <- round(data$Day.Minutes / data$Account.Length, 3)
data$ecall_avg <- round(data$Evening.Minutes / data$Account.Length, 3)
data$ncall_avg <- round(data$Night.Minutes / data$Account.Length, 3)
# Calculating average monthly day, evening and night calls durations for each
# plan type (1..10). A small helper replaces 30 copy-pasted mean() calls;
# vapply guarantees a numeric vector of length 10 in plan-type order.
plan_avg <- function(col) {
  vapply(1:10, function(pt) mean(data[data$Plan.Type == pt, ][[col]]), numeric(1))
}

dcalls_monthly_avg <- plan_avg("dcall_avg")
ecalls_monthly_avg <- plan_avg("ecall_avg")
ncalls_monthly_avg <- plan_avg("ncall_avg")
# Visualizing average monthly day, evening and night calls durations for each plan type
plan_types <- c("1","2","3","4","5","6","7","8","9","10")
colors <- c("dodgerblue1","dodgerblue3","dodgerblue4")
day_time <- c("Day","Evening","Night")

calls_monthly_avg <- matrix(c(dcalls_monthly_avg, ecalls_monthly_avg, ncalls_monthly_avg), nrow = 3, ncol = 10, byrow = TRUE)

barplot(calls_monthly_avg, 
        main = "Average monthly calls duration by plan type", 
        names.arg = plan_types, 
        xlab = "Plan type", 
        ylab = "Average monthly calls duration, min", 
        col = colors)

legend("topright", day_time, cex = 0.6, fill = colors)

# Stratified random split
# Fix the RNG seed so the 70/30 partition is reproducible across runs.
set.seed(123)
# createDataPartition stratifies on Churn, preserving the churn rate in
# both the training and the test set.
split_index <- createDataPartition(data$Churn, p = .7, list = FALSE, times = 1)
train <- data[split_index,]
test <- data[-split_index,]

# Convert the target to a factor so the models treat this as classification.
train$Churn <- as.factor(train$Churn)
test$Churn <- as.factor(test$Churn)
# Creating additional variables to get rid of multicollinearity:
# average duration of a single call in each period of the day.
data$dcall_dur_avg <- round(data$Day.Minutes / data$Day.Calls, 3)
# Bug fix: the evening and night averages previously divided by Day.Calls
# (copy-paste error); each period must use its own call count.
data$ecall_dur_avg <- round(data$Evening.Minutes / data$Evening.Calls, 3)
data$ncall_dur_avg <- round(data$Night.Minutes / data$Night.Calls, 3)
# Selecting features that will not be used for analysis
dropped_columns <- c("Voice.Mail","Day.Calls","Evening.Calls","Night.Calls","International.Plan.YES","International.Calls","Has.Phone","Phone.Payment.left","dcall_avg","ecall_avg","ncall_avg","dcall_dur_avg","ecall_dur_avg","ncall_dur_avg")

# Dropping unnecessary columns
train <- train[,!(names(train) %in% dropped_columns)]
test <- test[,!(names(test) %in% dropped_columns)]

# Creating dataframe that will later be used to build an ensemble algorithm
data_ens <- data[,!(names(data) %in% dropped_columns)]
# This formula will be used for all models: Churn regressed on the retained
# demographic, plan and usage features.
formula <- 'Churn ~ Income + Gender + Account.Length + Plan.Type + Voice.Mail.Messages + Day.Minutes + Evening.Minutes + Night.Minutes + International.Minutes + Phone.monthly.payment + Customer.Service.Calls'
# Logistic regression: binomial GLM with a logit link, fit on the training set.
logit <- glm(as.formula(formula),family=binomial(link='logit'),data=train)
summary(logit)

Call:
glm(formula = as.formula(formula), family = binomial(link = "logit"), 
    data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7924  -0.5024  -0.3083  -0.1711   2.9670  

Coefficients:
                         Estimate Std. Error z value Pr(>|z|)    
(Intercept)            -9.497e+00  8.894e-01 -10.678  < 2e-16 ***
Income                  5.371e-06  3.475e-06   1.546  0.12218    
Gender                 -9.708e-02  1.845e-01  -0.526  0.59880    
Account.Length         -2.052e-03  6.452e-03  -0.318  0.75043    
Plan.Type              -7.364e-03  3.196e-02  -0.230  0.81775    
Voice.Mail.Messages    -2.268e-02  7.954e-03  -2.851  0.00436 ** 
Day.Minutes             1.952e-02  1.942e-03  10.053  < 2e-16 ***
Evening.Minutes         8.412e-03  1.987e-03   4.232 2.31e-05 ***
Night.Minutes           4.695e-03  1.891e-03   2.482  0.01305 *  
International.Minutes   1.434e-01  1.853e-02   7.736 1.02e-14 ***
Phone.monthly.payment  -8.578e-03  4.287e-03  -2.001  0.04540 *  
Customer.Service.Calls  4.353e-01  7.082e-02   6.147 7.90e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1019.99  on 1300  degrees of freedom
Residual deviance:  787.78  on 1289  degrees of freedom
AIC: 811.78

Number of Fisher Scoring iterations: 6
# Trained logistic regression predictions
model_fit <- predict(logit, test, type = 'response')
logit_pred <- ifelse(model_fit > 0.5,1,0)
# Confusion matrix for logistic regression
logit_conf <- confusionMatrix(as.factor(logit_pred), test$Churn, mode = "everything", positive = "1")
logit_conf
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 475  56
         1  10  16
                                          
               Accuracy : 0.8815          
                 95% CI : (0.8517, 0.9072)
    No Information Rate : 0.8707          
    P-Value [Acc > NIR] : 0.2462          
                                          
                  Kappa : 0.2769          
                                          
 Mcnemar's Test P-Value : 3.04e-08        
                                          
            Sensitivity : 0.22222         
            Specificity : 0.97938         
         Pos Pred Value : 0.61538         
         Neg Pred Value : 0.89454         
              Precision : 0.61538         
                 Recall : 0.22222         
                     F1 : 0.32653         
             Prevalence : 0.12926         
         Detection Rate : 0.02873         
   Detection Prevalence : 0.04668         
      Balanced Accuracy : 0.60080         
                                          
       'Positive' Class : 1               
                                          
logit_f <- logit_conf$byClass[7]
# Algorithm to tune parameters of support vector machines
# 
# tune_out <- tune.svm(x = train[, -12], y = train$Churn,
#               type = "C-classification",
#               kernel = "polynomial", degree = 2, cost = 10^(-1:1),
#               gamma = c(0.1, 0.5, 1), coef0 = c(0.1, 0.5, 1))
# 
# 
# Obtained optimal values will further be used in svm
# tune_out$best.parameters$cost
# tune_out$best.parameters$gamma
# tune_out$best.parameters$coef0
set.seed(123)
# Support vector machines
svm_m = svm(formula = as.formula(formula), data = train, scale = TRUE, type = 'C-classification',  kernel = "polynomial", degree = 2, cost = 0.1,  gamma = 1, coef0 = 0.5)
set.seed(123)
# Support vector machines predictions
svm_p <- predict(svm_m, newdata = test)
svm_conf <- confusionMatrix(svm_p, test$Churn, mode = "everything", positive = "1")
svm_conf
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 478  37
         1   7  35
                                         
               Accuracy : 0.921          
                 95% CI : (0.8954, 0.942)
    No Information Rate : 0.8707         
    P-Value [Acc > NIR] : 0.0001144      
                                         
                  Kappa : 0.5734         
                                         
 Mcnemar's Test P-Value : 1.232e-05      
                                         
            Sensitivity : 0.48611        
            Specificity : 0.98557        
         Pos Pred Value : 0.83333        
         Neg Pred Value : 0.92816        
              Precision : 0.83333        
                 Recall : 0.48611        
                     F1 : 0.61404        
             Prevalence : 0.12926        
         Detection Rate : 0.06284        
   Detection Prevalence : 0.07540        
      Balanced Accuracy : 0.73584        
                                         
       'Positive' Class : 1              
                                         
svm_f <- svm_conf$byClass[7]
# Tuning parameter mtry for random forest
set.seed(123)
train_rf <- as.data.frame(train)
bestmtry <- tuneRF(train_rf[, -12], train_rf[,12], stepFactor=1.5, improve=1e-5, ntree=500)
mtry = 3  OOB error = 8.46% 
Searching left ...
mtry = 2    OOB error = 8.99% 
-0.06363636 1e-05 
Searching right ...
mtry = 4    OOB error = 8.69% 
-0.02727273 1e-05 

print(bestmtry)
      mtry   OOBError
2.OOB    2 0.08993082
3.OOB    3 0.08455035
4.OOB    4 0.08685626
set.seed(123)
# Random forest
randomf = randomForest(as.formula(formula), ntree = 500,  mtry = 3, data = train)
set.seed(123)
# Random forest predictions
y_pred = predict(randomf, newdata = test[-12])
randf_conf <- confusionMatrix(y_pred, test$Churn, mode = "everything", positive = "1")
importance(randomf)
                       MeanDecreaseGini
Income                        21.051915
Gender                         3.779393
Account.Length                17.793486
Plan.Type                     12.362275
Voice.Mail.Messages           10.860191
Day.Minutes                   96.341691
Evening.Minutes               36.883533
Night.Minutes                 26.307368
International.Minutes         26.356792
Phone.monthly.payment         15.413500
Customer.Service.Calls        32.591108
randf_conf
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 479  32
         1   6  40
                                          
               Accuracy : 0.9318          
                 95% CI : (0.9076, 0.9513)
    No Information Rate : 0.8707          
    P-Value [Acc > NIR] : 2.423e-06       
                                          
                  Kappa : 0.6419          
                                          
 Mcnemar's Test P-Value : 5.002e-05       
                                          
            Sensitivity : 0.55556         
            Specificity : 0.98763         
         Pos Pred Value : 0.86957         
         Neg Pred Value : 0.93738         
              Precision : 0.86957         
                 Recall : 0.55556         
                     F1 : 0.67797         
             Prevalence : 0.12926         
         Detection Rate : 0.07181         
   Detection Prevalence : 0.08259         
      Balanced Accuracy : 0.77159         
                                          
       'Positive' Class : 1               
                                          
randf_f <- randf_conf$byClass[7]
set.seed(123)
# Simple decision tree
dtree <- rpart(formula = as.formula(formula), data = train, cp=0.01)
# Choosing optimal complexity parameter (cp) using plot
printcp(dtree)

Classification tree:
rpart(formula = as.formula(formula), data = train, cp = 0.01)

Variables actually used in tree construction:
[1] Customer.Service.Calls Day.Minutes            Evening.Minutes        International.Minutes 
[5] Phone.monthly.payment  Voice.Mail.Messages   

Root node error: 173/1301 = 0.13297

n= 1301 

        CP nsplit rel error  xerror     xstd
1 0.167630      0   1.00000 1.00000 0.070793
2 0.063584      1   0.83237 0.87283 0.066781
3 0.057803      3   0.70520 0.82659 0.065214
4 0.023121      4   0.64740 0.74566 0.062312
5 0.011561     10   0.50867 0.80925 0.064609
6 0.010000     11   0.49711 0.79191 0.063996
plotcp(dtree)

rpart.plot(dtree)

set.seed(123)
# Decision tree predictions
tree_pred <- predict(dtree, test, type = 'class')

dtree_conf <- confusionMatrix(tree_pred, test$Churn,  mode = "everything", positive = "1")
dtree_conf
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 473  30
         1  12  42
                                          
               Accuracy : 0.9246          
                 95% CI : (0.8994, 0.9451)
    No Information Rate : 0.8707          
    P-Value [Acc > NIR] : 3.499e-05       
                                          
                  Kappa : 0.6251          
                                          
 Mcnemar's Test P-Value : 0.008712        
                                          
            Sensitivity : 0.58333         
            Specificity : 0.97526         
         Pos Pred Value : 0.77778         
         Neg Pred Value : 0.94036         
              Precision : 0.77778         
                 Recall : 0.58333         
                     F1 : 0.66667         
             Prevalence : 0.12926         
         Detection Rate : 0.07540         
   Detection Prevalence : 0.09695         
      Balanced Accuracy : 0.77930         
                                          
       'Positive' Class : 1               
                                          
dtree_f <- dtree_conf$byClass[7]
# Preparing data for the xgboost algorithm: split label from features and
# coerce to numeric matrices, as required by xgb.DMatrix.
# NOTE(review): column 12 is assumed to be Churn after the column drops
# above — confirm the column order if dropped_columns changes.
xgb_train_churn <- as.matrix(train[12])
xgb_train <- data.matrix(train[-12])

xgb_test_churn <- as.matrix(test[12])
xgb_test <- data.matrix(test[-12])

# xgb.DMatrix is xgboost's optimized container holding features plus label.
xgb_train_m = xgb.DMatrix(data=xgb_train, label=xgb_train_churn)
xgb_test_m = xgb.DMatrix(data=xgb_test, label=xgb_test_churn)
# Baseline hyperparameters for the binary logistic (probability) objective.
params <- list(booster = "gbtree", objective = "binary:logistic", eta=0.3, gamma=0, max_depth=6, min_child_weight=1, subsample=1, colsample_bytree=1)
# Extracting optimal features for xgboost using cross validation
xgbcv <- xgb.cv( params = params, data = xgb_train_m, nrounds = 100, nfold = 5, showsd = T, stratified = T, print.every.n = 10, early.stop.round = 20, maximize = T)
Warning: 'print.every.n' is deprecated.
Use 'print_every_n' instead.
See help("Deprecated") and help("xgboost-deprecated").
Warning: 'early.stop.round' is deprecated.
Use 'early_stopping_rounds' instead.
See help("Deprecated") and help("xgboost-deprecated").
[1] train-logloss:0.490664+0.002389 test-logloss:0.514700+0.002804 
Multiple eval metrics are present. Will use test_logloss for early stopping.
Will train until test_logloss hasn't improved in 20 rounds.

[11]    train-logloss:0.107408+0.004752 test-logloss:0.237669+0.016557 
[21]    train-logloss:0.060796+0.003263 test-logloss:0.240536+0.027036 
Stopping. Best iteration:
[1] train-logloss:0.490664+0.002389 test-logloss:0.514700+0.002804
# XGBoost algorithm
set.seed(123)
xgb <- xgboost(data = xgb_train_m, params=params, nrounds=41)
[1] train-logloss:0.492093 
[2] train-logloss:0.374901 
[3] train-logloss:0.302414 
[4] train-logloss:0.251825 
[5] train-logloss:0.214501 
[6] train-logloss:0.184347 
[7] train-logloss:0.161494 
[8] train-logloss:0.145188 
[9] train-logloss:0.129774 
[10]    train-logloss:0.119050 
[11]    train-logloss:0.109443 
[12]    train-logloss:0.101285 
[13]    train-logloss:0.094453 
[14]    train-logloss:0.086692 
[15]    train-logloss:0.084022 
[16]    train-logloss:0.081549 
[17]    train-logloss:0.077827 
[18]    train-logloss:0.072342 
[19]    train-logloss:0.069526 
[20]    train-logloss:0.066059 
[21]    train-logloss:0.062552 
[22]    train-logloss:0.061327 
[23]    train-logloss:0.059917 
[24]    train-logloss:0.057204 
[25]    train-logloss:0.054538 
[26]    train-logloss:0.053387 
[27]    train-logloss:0.051132 
[28]    train-logloss:0.049806 
[29]    train-logloss:0.046619 
[30]    train-logloss:0.045243 
[31]    train-logloss:0.042654 
[32]    train-logloss:0.041777 
[33]    train-logloss:0.040314 
[34]    train-logloss:0.039193 
[35]    train-logloss:0.038492 
[36]    train-logloss:0.037844 
[37]    train-logloss:0.037258 
[38]    train-logloss:0.035513 
[39]    train-logloss:0.034067 
[40]    train-logloss:0.033318 
[41]    train-logloss:0.032010 
set.seed(123)
# xgboost predictions
xgb_p = predict(xgb, xgb_test_m)
xgb_p = as.factor(round(xgb_p))

xgb_conf <- confusionMatrix(xgb_p, test$Churn,  mode = "everything", positive = "1")
xgb_conf
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 474  25
         1  11  47
                                          
               Accuracy : 0.9354          
                 95% CI : (0.9116, 0.9543)
    No Information Rate : 0.8707          
    P-Value [Acc > NIR] : 5.43e-07        
                                          
                  Kappa : 0.687           
                                          
 Mcnemar's Test P-Value : 0.03026         
                                          
            Sensitivity : 0.65278         
            Specificity : 0.97732         
         Pos Pred Value : 0.81034         
         Neg Pred Value : 0.94990         
              Precision : 0.81034         
                 Recall : 0.65278         
                     F1 : 0.72308         
             Prevalence : 0.12926         
         Detection Rate : 0.08438         
   Detection Prevalence : 0.10413         
      Balanced Accuracy : 0.81505         
                                          
       'Positive' Class : 1               
                                          
xgb_f <- xgb_conf$byClass[7]
# Importance of features
xgb.importance(model=xgb)

# SHAP values for each feature
shap <- shap.prep(xgb_model = xgb, X_train = xgb_train)
shap.plot.summary(shap)

shap.plot.dependence(shap, "Day.Minutes", alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(shap, "Day.Minutes", color_feature = 'Evening.Minutes', alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(shap, "Customer.Service.Calls", color_feature = 'Day.Minutes', alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  pseudoinverse used at -0.025
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  neighborhood radius 2.025
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  reciprocal condition number  5.302e-15
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  There are other near singularities as well. 1

shap.plot.dependence(shap, "International.Minutes", color_feature = 'auto', alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  at  -0.0945
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  radius  0.0089303
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  all data on boundary of neighborhood. make span bigger
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  pseudoinverse used at -0.0945
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  neighborhood radius 0.0945
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  reciprocal condition number  1
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  zero-width neighborhood. make span bigger
Warning: Computation failed in `stat_smooth()`:
NA/NaN/Inf in foreign function call (arg 5)

shap.plot.dependence(shap, "Evening.Minutes", color_feature = 'auto', alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'

set.seed(123)
# Predicting churn with each model
log_pred <- predict(logit, newdata = data_ens, type = 'response')
log_pred <- as.factor(ifelse(log_pred > 0.5,1,0))

svm_pred <- predict(svm_m, newdata = data_ens)

randf_pred <- predict(randomf, newdata = data_ens)

dtree_pred <- predict(dtree, newdata = data_ens, type = 'class')

xgb_matrix <- xgb.DMatrix(data=data.matrix(data_ens[,-12]), label=as.matrix(data_ens$Churn))

xgb_pred <- predict(xgb, newdata = xgb_matrix)
xgb_pred <- as.factor(round(xgb_pred))

# Building the dataframe for ensemble method
predictions <- data.frame(log_pred, svm_pred, randf_pred, dtree_pred, xgb_pred)
predictions$ens <- NA
# Assigning scores for each model based on respective F-score
sum_f <- sum(logit_f, svm_f, dtree_f, randf_f, xgb_f)
log_score <- (logit_f / sum_f)
svm_score <- (svm_f / sum_f)
randf_score <- (randf_f / sum_f)
dtree_score <- (dtree_f / sum_f)
xgb_score <- (xgb_f / sum_f)

scores <- c(log_score, svm_score, randf_score, dtree_score,xgb_score)
# Ensemble prediction function
# Weighted-majority vote: each model's vote counts with its normalized
# F1-based score; the class with the larger total weight wins (a tie goes
# to the non-churn class, matching the original strict `y > n` test).
# seq_len(nrow(...)) replaces the 1:length(...) anti-pattern and iterates
# over the same prediction rows.
for (i in seq_len(nrow(predictions))) {
  churn_weight <- 0
  stay_weight <- 0
  for (j in 1:5) {
    if (predictions[i, j] == 0) {
      stay_weight <- stay_weight + scores[j]
    } else {
      churn_weight <- churn_weight + scores[j]
    }
  }
  predictions$ens[i] <- if (churn_weight > stay_weight) 1 else 0
}
# Measuring the performance of an ensemble
ens_conf <- confusionMatrix(as.factor(predictions$ens), as.factor(data_ens$Churn), mode = 'everything', positive = '1')
ens_conf
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1610   81
         1    3  164
                                          
               Accuracy : 0.9548          
                 95% CI : (0.9443, 0.9638)
    No Information Rate : 0.8681          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.7717          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.66939         
            Specificity : 0.99814         
         Pos Pred Value : 0.98204         
         Neg Pred Value : 0.95210         
              Precision : 0.98204         
                 Recall : 0.66939         
                     F1 : 0.79612         
             Prevalence : 0.13186         
         Detection Rate : 0.08827         
   Detection Prevalence : 0.08988         
      Balanced Accuracy : 0.83376         
                                          
       'Positive' Class : 1               
                                          
ens_f <- ens_conf$byClass[7]

f_scores_all <- c(logit_f, svm_f, dtree_f, randf_f, xgb_f, ens_f)
f_scores_all
       F1        F1        F1        F1        F1        F1 
0.3265306 0.6140351 0.6666667 0.6779661 0.7230769 0.7961165 
# Min max normalization function
# Min-max normalization: linearly rescale a numeric vector to [0, 1].
#
# x: numeric vector (no NAs expected; NAs would propagate through range()).
# Returns a vector of the same length. A constant input previously produced
# NaN (division by zero); it now maps to all zeros.
minmax_norm <- function(x) {
  rng <- range(x)
  span <- rng[2] - rng[1]
  if (span == 0) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / span
}
# Dataframe consisting only of people who churned
churn_df <- data_ens[data_ens$Churn==1, ]

churn_df <- churn_df[, c("Day.Minutes", "International.Minutes", "Customer.Service.Calls")]

# churn_df <- as.data.frame(lapply(churn_df, minmax_norm))

# churn_df <- churn_df[,!(names(churn_df) %in% c("Plan.Type", "Gender", "Churn"))]

churn_df <- as.data.frame(scale(churn_df))
# pca_res = prcomp(churn_df, center = TRUE, scale = TRUE)
# summary(pca_res)
# Visualizing elbow method

wss <- sapply(1:10, function(k){kmeans(churn_df, k, nstart=25,iter.max = 50 )$tot.withinss})

plot(1:10, wss,
     type="b", pch = 19, frame = F,
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

# Defining optimal number of k for k-means algorithm
set.seed(123)
# Silhouette method
fviz_nbclust(churn_df, kmeans, nstart = 25, k.max = 15, iter.max = 50, method = "silhouette")+
  labs(subtitle = "Silhouette method")

# K-means with k=4
k4 <- kmeans(churn_df, centers = 4, iter.max = 50, nstart = 25)
k4
K-means clustering with 4 clusters of sizes 36, 115, 45, 49

Cluster means:
  Day.Minutes International.Minutes Customer.Service.Calls
1  -1.0930388             1.3095389              0.4343530
2   0.6160607            -0.6653610             -0.4337388
3   0.4143949             1.3927979             -0.7551460
4  -1.0233746            -0.6796489              1.3923433

Clustering vector:
  [1] 2 2 1 3 4 2 4 4 2 2 2 4 2 2 3 2 2 3 2 2 1 3 2 1 4 1 2 4 1 2 2 4 2 2 2 2 4 4 2 2 1 3 1 2 3 2 2 2 2 2 1 2
 [53] 3 2 3 1 4 2 2 2 1 2 1 2 4 4 4 3 2 2 2 2 2 2 3 3 4 4 3 4 1 4 3 2 4 2 4 4 1 2 2 2 1 4 3 4 4 1 1 3 3 4 4 3
[105] 2 3 4 2 2 4 2 3 2 2 3 3 4 4 2 3 2 3 4 2 3 4 2 2 2 3 2 3 4 4 2 2 3 2 2 2 2 1 2 2 4 1 2 1 2 2 1 2 2 2 2 2
[157] 3 2 1 2 2 1 1 2 3 4 2 1 2 2 2 2 1 4 4 4 2 2 3 4 2 2 3 2 3 3 2 1 4 1 4 2 3 2 2 4 1 4 2 1 2 2 1 4 1 2 1 2
[209] 2 2 3 4 3 3 3 4 2 4 2 3 2 2 2 2 3 1 3 2 4 1 3 2 3 4 1 2 3 2 2 2 2 1 2 2 3

Within cluster sum of squares by cluster:
[1] 51.23809 90.22534 43.70436 29.47427
 (between_SS / total_SS =  70.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
[8] "iter"         "ifault"      
# Evaluating built clusters with silhouette width
sil <- silhouette(k4$cluster, dist(churn_df))
fviz_silhouette(sil)

churn_df$cluster = factor(k4$cluster)

p <- plot_ly(churn_df, x=~Day.Minutes, y=~International.Minutes, 
z=~Customer.Service.Calls, color=~cluster) %>% add_markers(size=1.5)
print(p)
NULL
---
title: "Developing an ensemble approach for predicting customer churn"
output: html_notebook
author: Nazar Todoshchuk
---

```{r}
# In order to run the code, uncomment this chunk and install required packages

# install.packages('readxl')
# install.packages('caret')
# install.packages('corrplot')
# install.packages('outliers')
# install.packages('e1071')
# install.packages("randomForest")
# install.packages("rpart.plot")
# install.packages("xgboost")
# install.packages("SHAPforxgboost")
# install.packages("factoextra")
# install.packages("NbClust")
# install.packages("plotly")
```

```{r}
library("readxl")
library(corrplot)
library(outliers)
library(caret)
library(e1071)
library("randomForest")
library(rpart)
library(rpart.plot)
library(xgboost)
library("SHAPforxgboost")
library(factoextra)
library(NbClust)
library(clValid)
library(dplyr)
library(plotly)
```

```{r}
# Loading data
data <- read_excel("ChurnData.xlsx")
head(data)
```

```{r}
# Removing NA values: drop all-NA columns first, then any remaining
# incomplete rows.
data <- data[,colSums(is.na(data))<nrow(data)]
data <- na.omit(data)
# Removing unnecessary column Customer
data <- subset(data, select = -Customer)
# Removing spaces in column names
colnames(data) <- make.names(names(data))
# Replacing 0s in Account.Length with 1.
# Bug fix: the previous `data[data$Account.Length == 0, ] <- 1` assigned 1
# to EVERY column of the matching rows; only Account.Length should change.
data$Account.Length[data$Account.Length == 0] <- 1
```

```{r}
hist(data$Customer.Service.Calls,
    main = "Customer service calls distribution",
    xlab = "Customer service calls, calls",
    xlim = c(0, max(data$Customer.Service.Calls)),
    col = "dodgerblue3",
    breaks = 10)

axis(side=1, at=seq(0, 9, 1))
```

```{r}
# Churners VS Non churners distribution
num_of_churners_non_churners <- c(nrow(data[data$Churn==1, ]),nrow(data[data$Churn==0, ]))
percentage <- round(num_of_churners_non_churners * 100 / sum(num_of_churners_non_churners), 1)
colors <- c("dodgerblue1","dodgerblue4")

pie(num_of_churners_non_churners, 
    main = "Churners VS Non churners distribution",
    labels = percentage,
    col = colors)

legend("topright", c("Churners","Non churners"), cex = 1, fill = colors)

```

```{r}
# Horizontal boxplot helper used for the three panels below
hbox <- function(values, title, axis_label) {
  boxplot(values,
          scientific = FALSE,
          horizontal = TRUE,
          ylim = c(0, max(values)),
          main = title,
          xlab = axis_label,
          col = "dodgerblue3")
}

# Three stacked panels: income, tenure and voice-mail volume
par(mfrow = c(3, 1))
hbox(data$Income, "Income distribution", "Annual Income, $")
hbox(data$Account.Length, "Account Length distribution", "Account Length, months")
hbox(data$Voice.Mail.Messages, "Voice Mail Messages distribution", "Voice Mail Messages, messages")
```

```{r}
par(mfrow = c(3, 1))
# One horizontal boxplot per day-part minutes column
minute_panels <- list(
  list(column = "Day.Minutes",     title = "Day Minutes distribution",     label = "Day Minutes, min"),
  list(column = "Evening.Minutes", title = "Evening Minutes distribution", label = "Evening Minutes, min"),
  list(column = "Night.Minutes",   title = "Night Minutes distribution",   label = "Night Minutes, min")
)
for (cfg in minute_panels) {
  vals <- data[[cfg$column]]
  boxplot(vals,
          scientific = FALSE,
          horizontal = TRUE,
          ylim = c(0, max(vals)),
          main = cfg$title,
          xlab = cfg$label,
          col = "dodgerblue3")
}
```

```{r}
# International minutes distribution (single panel)
boxplot(data$International.Minutes,
        scientific = FALSE,
        horizontal = TRUE,
        col = "dodgerblue3",
        ylim = c(0, max(data$International.Minutes)),
        main = "International Minutes distribution",
        xlab = "International Minutes, min")
```

```{r}
# Numeric columns screened for outliers
data_out <- data[, c("Income", "Account.Length", "Day.Minutes", "Day.Calls", "Evening.Minutes", "Evening.Calls", "Night.Minutes", "Night.Calls", "Customer.Service.Calls")]

# Keep rows where every screened column has |z| <= 3
z_scores <- abs(scale(data_out))

not_outliers <- which(!rowSums(z_scores>3))

# FIX: index rows positionally. The original matched character rownames
# against integer positions (`rownames(data) %in% not_outliers`), which
# only worked by accident of the default "1".."n" rownames and silently
# breaks if rownames are ever non-default.
data <- data[not_outliers, ]
```

```{r}
# Pairwise correlations across all columns, drawn as a lower-triangle
# circle plot to spot multicollinearity before modelling
cor_matrix <- cor(data)
corrplot(cor_matrix,
         method = "circle",
         type = "lower",
         tl.col = "black",
         tl.cex = 0.6,
         bg = "white",
         mar = c(0, 0, 2, 0))
```

```{r}
# Average minutes used per month of tenure, by day part
per_month <- function(minutes) round(minutes / data$Account.Length, 3)
data$dcall_avg <- per_month(data$Day.Minutes)
data$ecall_avg <- per_month(data$Evening.Minutes)
data$ncall_avg <- per_month(data$Night.Minutes)
```

```{r}
# Mean monthly day/evening/night calling time for each of the 10 plan
# types. Replaces three 10-way copy-pasted mean() chains with one helper;
# vapply pins the result to an unnamed numeric vector of length 10,
# identical to the original c(mean(...), ...) construction.
plan_means <- function(column) {
  vapply(1:10,
         function(plan) mean(data[data$Plan.Type == plan, ][[column]]),
         numeric(1))
}

dcalls_monthly_avg <- plan_means("dcall_avg")
ecalls_monthly_avg <- plan_means("ecall_avg")
ncalls_monthly_avg <- plan_means("ncall_avg")
```

```{r}
# Grouped (stacked) bar chart of the per-plan monthly averages
plan_types <- as.character(1:10)
colors <- c("dodgerblue1", "dodgerblue3", "dodgerblue4")
day_time <- c("Day", "Evening", "Night")

# Rows: day part; columns: plan type 1..10
calls_monthly_avg <- matrix(
  c(dcalls_monthly_avg, ecalls_monthly_avg, ncalls_monthly_avg),
  nrow = 3, ncol = 10, byrow = TRUE
)

barplot(calls_monthly_avg,
        names.arg = plan_types,
        col = colors,
        main = "Average monthly calls duration by plan type",
        xlab = "Plan type",
        ylab = "Average monthly calls duration, min")

legend("topright", day_time, cex = 0.6, fill = colors)

```

```{r}
# 70/30 split stratified on the Churn outcome
set.seed(123)
split_index <- createDataPartition(data$Churn, p = 0.7, list = FALSE, times = 1)
train <- data[split_index, ]
test <- data[-split_index, ]

# Classifiers below expect a two-level factor outcome
train$Churn <- factor(train$Churn)
test$Churn <- factor(test$Churn)
```

```{r}
# Average duration of a single call (minutes per call) by day part,
# created to reduce multicollinearity among the raw usage columns.
# BUG FIX: the evening and night averages previously divided by
# Day.Calls (copy-paste error); they must divide by the matching
# Evening.Calls / Night.Calls counts.
data$dcall_dur_avg <- round(data$Day.Minutes / data$Day.Calls, 3)
data$ecall_dur_avg <- round(data$Evening.Minutes / data$Evening.Calls, 3)
data$ncall_dur_avg <- round(data$Night.Minutes / data$Night.Calls, 3)
```

```{r}
# Features excluded from modelling (collinear, derived, or id-like)
dropped_columns <- c("Voice.Mail", "Day.Calls", "Evening.Calls", "Night.Calls",
                     "International.Plan.YES", "International.Calls", "Has.Phone",
                     "Phone.Payment.left", "dcall_avg", "ecall_avg", "ncall_avg",
                     "dcall_dur_avg", "ecall_dur_avg", "ncall_dur_avg")

# Keep only the retained features in both splits
train <- train[, setdiff(names(train), dropped_columns)]
test <- test[, setdiff(names(test), dropped_columns)]

# Full dataset with the same feature set, used later by the ensemble
data_ens <- data[, setdiff(names(data), dropped_columns)]
```

```{r}
# Shared model formula (as a string): Churn against the retained predictors.
# Every model below converts it with as.formula().
predictors <- c("Income", "Gender", "Account.Length", "Plan.Type",
                "Voice.Mail.Messages", "Day.Minutes", "Evening.Minutes",
                "Night.Minutes", "International.Minutes",
                "Phone.monthly.payment", "Customer.Service.Calls")
formula <- paste("Churn ~", paste(predictors, collapse = " + "))
```

```{r}
# Baseline model: logistic regression on the training split
logit <- glm(as.formula(formula), data = train,
             family = binomial(link = 'logit'))
summary(logit)
```

```{r}
# Score the held-out split; threshold churn probabilities at 0.5
model_fit <- predict(logit, test, type = 'response')
logit_pred <- ifelse(model_fit > 0.5, 1, 0)

# Confusion matrix for the logistic regression
logit_conf <- confusionMatrix(as.factor(logit_pred), test$Churn,
                              mode = "everything", positive = "1")
logit_conf
logit_f <- logit_conf$byClass[7]  # F1, used later for ensemble weighting
```

```{r}
# Grid search over SVM hyper-parameters (cost, gamma, coef0) with a
# degree-2 polynomial kernel. Left commented out because the search is
# slow; the best values it produced are hard-coded in the svm() call below.
# NOTE(review): `train[, -12]` assumes Churn is column 12 of train — confirm.
# 
# tune_out <- tune.svm(x = train[, -12], y = train$Churn,
#               type = "C-classification",
#               kernel = "polynomial", degree = 2, cost = 10^(-1:1),
#               gamma = c(0.1, 0.5, 1), coef0 = c(0.1, 0.5, 1))
# 
# 
# Read the tuned values from the search result:
# tune_out$best.parameters$cost
# tune_out$best.parameters$gamma
# tune_out$best.parameters$coef0
```

```{r}
set.seed(123)
# Polynomial-kernel SVM using the hyper-parameters from the (commented)
# tune.svm() search
svm_m <- svm(formula = as.formula(formula),
             data = train,
             scale = TRUE,
             type = 'C-classification',
             kernel = "polynomial",
             degree = 2,
             cost = 0.1,
             gamma = 1,
             coef0 = 0.5)
```

```{r}
set.seed(123)
# SVM predictions on the held-out split
svm_p <- predict(svm_m, newdata = test)
svm_conf <- confusionMatrix(svm_p, test$Churn,
                            mode = "everything", positive = "1")
svm_conf
svm_f <- svm_conf$byClass[7]  # F1, used later for ensemble weighting
```
```{r}
# Search for the best mtry (features tried per split) for the forest.
# FIX: select the response by name instead of the magic column index 12,
# so the tuning step keeps working if the column layout ever changes.
set.seed(123)
train_rf <- as.data.frame(train)
bestmtry <- tuneRF(train_rf[, names(train_rf) != "Churn"], train_rf$Churn,
                   stepFactor = 1.5, improve = 1e-5, ntree = 500)
print(bestmtry)
```


```{r}
set.seed(123)
# Random forest with mtry chosen from the tuneRF search above
randomf <- randomForest(as.formula(formula), data = train,
                        ntree = 500, mtry = 3)
```

```{r}
set.seed(123)
# Random forest predictions on the held-out split.
# FIX: drop the response column by name rather than the magic index 12
# (`test[-12]`), which silently breaks if the column order changes.
y_pred = predict(randomf, newdata = test[, names(test) != "Churn"])
randf_conf <- confusionMatrix(y_pred, test$Churn, mode = "everything", positive = "1")
# Per-feature importance from the fitted forest
importance(randomf)
randf_conf

randf_f <- randf_conf$byClass[7]  # F1, used later for ensemble weighting
```

```{r}
set.seed(123)
# Single decision tree; printcp/plotcp output guides the choice of the
# complexity parameter (cp)
dtree <- rpart(formula = as.formula(formula), data = train, cp = 0.01)
printcp(dtree)
plotcp(dtree)

```

```{r}
# Visualise the fitted decision tree
rpart.plot(dtree)
```

```{r}
set.seed(123)
# Decision tree predictions on the held-out split
tree_pred <- predict(dtree, test, type = 'class')

dtree_conf <- confusionMatrix(tree_pred, test$Churn,
                              mode = "everything", positive = "1")
dtree_conf
dtree_f <- dtree_conf$byClass[7]  # F1, used later for ensemble weighting
```

```{r}
# Preparing train/test matrices for the xgboost algorithm.
# NOTE(review): column 12 is assumed to be Churn in both frames — confirm
# against the column-drop step above. Also note train$Churn was converted
# to a factor earlier; data.matrix() encodes factors as integer level
# codes, so verify the resulting labels are the 0/1 values that
# binary:logistic expects.
xgb_train_churn <- as.matrix(train[12])
xgb_train <- data.matrix(train[-12])

xgb_test_churn <- as.matrix(test[12])
xgb_test <- data.matrix(test[-12])

# Bundle features and labels into xgboost's native DMatrix container
xgb_train_m = xgb.DMatrix(data=xgb_train, label=xgb_train_churn)
xgb_test_m = xgb.DMatrix(data=xgb_test, label=xgb_test_churn)
```

```{r}
# Gradient-boosted tree settings for binary churn classification
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eta = 0.3,
  gamma = 0,
  max_depth = 6,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1
)
```

```{r}
# 5-fold cross-validation to choose the number of boosting rounds.
# FIX: use the current argument names print_every_n / early_stopping_rounds
# (the dotted print.every.n / early.stop.round forms are long deprecated in
# the xgboost R package) and spell out TRUE instead of the reassignable T.
xgbcv <- xgb.cv(params = params, data = xgb_train_m, nrounds = 100,
                nfold = 5, showsd = TRUE, stratified = TRUE,
                print_every_n = 10, early_stopping_rounds = 20,
                maximize = TRUE)
```

```{r}
# Final xgboost fit; 41 rounds chosen from the cross-validation above
set.seed(123)
xgb <- xgboost(params = params, data = xgb_train_m, nrounds = 41)
```

```{r}
set.seed(123)
# xgboost emits churn probabilities; round() thresholds them at 0.5
xgb_p <- predict(xgb, xgb_test_m)
xgb_p <- as.factor(round(xgb_p))

xgb_conf <- confusionMatrix(xgb_p, test$Churn,
                            mode = "everything", positive = "1")
xgb_conf
xgb_f <- xgb_conf$byClass[7]  # F1, used later for ensemble weighting
```

```{r}
# Gain-based feature importance from the fitted booster
xgb.importance(model = xgb)

# SHAP values: per-observation contribution of each feature, summarised
shap <- shap.prep(xgb_model = xgb, X_train = xgb_train)
shap.plot.summary(shap)
```

```{r}
# SHAP dependence plots: Day.Minutes alone, Day.Minutes coloured by
# Evening.Minutes, and Customer.Service.Calls coloured by Day.Minutes.
shap.plot.dependence(shap, "Day.Minutes", alpha = 0.7, jitter_width = 0.1)
shap.plot.dependence(shap, "Day.Minutes", color_feature = 'Evening.Minutes', alpha = 0.7, jitter_width = 0.1)
shap.plot.dependence(shap, "Customer.Service.Calls", color_feature = 'Day.Minutes', alpha = 0.7, jitter_width = 0.1)
```


```{r}
# Dependence plots with color_feature = 'auto': SHAPforxgboost picks the
# strongest interacting feature for the colour scale.
shap.plot.dependence(shap, "International.Minutes", color_feature = 'auto', alpha = 0.7, jitter_width = 0.1)
shap.plot.dependence(shap, "Evening.Minutes", color_feature = 'auto', alpha = 0.7, jitter_width = 0.1)
```


```{r}
set.seed(123)
# Collect each fitted model's churn prediction over the full feature set
log_pred <- predict(logit, newdata = data_ens, type = 'response')
log_pred <- as.factor(ifelse(log_pred > 0.5, 1, 0))

svm_pred <- predict(svm_m, newdata = data_ens)
randf_pred <- predict(randomf, newdata = data_ens)
dtree_pred <- predict(dtree, newdata = data_ens, type = 'class')

# xgboost needs its DMatrix container (column 12 assumed to be Churn,
# mirroring the training-data preparation above)
xgb_matrix <- xgb.DMatrix(data = data.matrix(data_ens[, -12]),
                          label = as.matrix(data_ens$Churn))
xgb_pred <- predict(xgb, newdata = xgb_matrix)
xgb_pred <- as.factor(round(xgb_pred))

# One column per model; `ens` will hold the weighted-vote outcome
predictions <- data.frame(log_pred, svm_pred, randf_pred, dtree_pred, xgb_pred)
predictions$ens <- NA
```

```{r}
# Each model's voting weight is its F1 share of the total F1 mass
sum_f <- logit_f + svm_f + dtree_f + randf_f + xgb_f
log_score   <- logit_f / sum_f
svm_score   <- svm_f / sum_f
randf_score <- randf_f / sum_f
dtree_score <- dtree_f / sum_f
xgb_score   <- xgb_f / sum_f

# Order matches the columns of `predictions`
scores <- c(log_score, svm_score, randf_score, dtree_score, xgb_score)
```

```{r}
# Weighted majority vote: each of the five models contributes its
# F1-based score to the "yes" or "no" side of every observation
for (i in seq_len(nrow(predictions))) {
  yes_weight <- 0
  no_weight <- 0
  for (j in 1:5) {
    if (predictions[i, j] == 0) {
      no_weight <- no_weight + scores[j]
    } else {
      yes_weight <- yes_weight + scores[j]
    }
  }
  predictions$ens[i] <- if (yes_weight > no_weight) 1 else 0
}
```

```{r}
# Ensemble performance, then F1 comparison against every individual model
ens_conf <- confusionMatrix(as.factor(predictions$ens),
                            as.factor(data_ens$Churn),
                            mode = 'everything', positive = '1')
ens_conf
ens_f <- ens_conf$byClass[7]

f_scores_all <- c(logit_f, svm_f, dtree_f, randf_f, xgb_f, ens_f)
f_scores_all
```

```{r}
# Rescale a numeric vector to [0, 1] (min-max normalisation).
#
# x     : numeric vector to rescale.
# na.rm : drop NA values when computing the range? Defaults to FALSE,
#         matching the original behaviour (any NA propagates).
# Returns a numeric vector the same length as x; a constant vector
# yields NaN (0/0), as before.
minmax_norm <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
```

```{r}
# Churners only, reduced to the three clustering features, then
# standardised (z-scores). Min-max scaling via minmax_norm() would be a
# possible alternative normalisation.
churn_df <- data_ens[data_ens$Churn == 1,
                     c("Day.Minutes", "International.Minutes",
                       "Customer.Service.Calls")]

churn_df <- as.data.frame(scale(churn_df))
```

```{r}
# pca_res = prcomp(churn_df, center = TRUE, scale = TRUE)
# summary(pca_res)
```


```{r}
# Elbow method: total within-cluster sum of squares for k = 1..10.
# FIX: vapply() pins the return type to numeric(1) per element (sapply's
# return shape is input-dependent), and frame = FALSE replaces the
# reassignable shorthand F.
wss <- vapply(1:10, function(k) {
  kmeans(churn_df, k, nstart = 25, iter.max = 50)$tot.withinss
}, numeric(1))

plot(1:10, wss,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
```

```{r}
# Average silhouette width across candidate k values for k-means
set.seed(123)
fviz_nbclust(churn_df, kmeans, nstart = 25, k.max = 15, iter.max = 50,
             method = "silhouette") +
  labs(subtitle = "Silhouette method")

```

```{r}
# Final k-means clustering with k = 4 (chosen from the diagnostics above)
k4 <- kmeans(churn_df, centers = 4, nstart = 25, iter.max = 50)
k4
```
```{r}
# Per-observation silhouette widths, plotted by cluster
pairwise_dist <- dist(churn_df)
sil <- silhouette(k4$cluster, pairwise_dist)
fviz_silhouette(sil)
```

```{r}
# Interactive 3-D scatter of the four clusters
churn_df$cluster <- factor(k4$cluster)

p <- plot_ly(churn_df,
             x = ~Day.Minutes,
             y = ~International.Minutes,
             z = ~Customer.Service.Calls,
             color = ~cluster) %>%
  add_markers(size = 1.5)
print(p)
```

